# Built-in Imports
import pathlib
import statistics as stats
from datetime import datetime, date
# Third Party Imports
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import xlsxwriter
FalseSouth is in the transportation industry, and we help drivers book and find loads with brokers as well as take care of drivers’ back office operations. FalseSouth usually gets paid directly by brokers for the service of hauling loads. However, FalseSouth is having trouble assessing if our collections process has been successful and how much money we are receiving from the brokers. Therefore, the FalseSouth team has assigned to you the task of digging into the accounts receivable data to help with the collections.
Please address the following questions and deliverables:
General Note: Depending on the size and end state of a project, I would make the decision on whether to leave any utility functions (like the one below) "in-line" or move out to a local module. To demonstrate the workflow, and hopefully make the final document more interpretable, I am leaving them in place where utilized.
# Function I use often to speed up work on large local datasets. Overkill for this example, but adding for common workflow demonstration.
def get_and_pickle_data(xl_path, pkl_path, force_rerun=False):
    '''
    Load invoice data, caching the parsed result as a pickle.

    If a pickle already exists at ``pkl_path`` (and a forced rerun is not
    requested), read it — much faster than re-parsing Excel. Otherwise read
    the raw workbook at ``xl_path`` and write the pickle cache for next time.

    Parameters
    ----------
    xl_path : pathlib.Path
        Path to the raw Excel workbook.
    pkl_path : pathlib.Path
        Path where the cached pickle lives (or will be written).
    force_rerun : bool, default False
        When True, ignore any existing pickle and re-read the Excel file.

    Returns
    -------
    pd.DataFrame
    '''
    # Use short-circuiting `and not` rather than bitwise `&` with `== False`:
    # `&` does not short-circuit and comparing to False is un-idiomatic.
    if pkl_path.exists() and not force_rerun:
        df = pd.read_pickle(pkl_path)
    else:
        # No cache (or rerun requested): parse the raw Excel file and cache it.
        df = pd.read_excel(xl_path, parse_dates=True)
        df.to_pickle(pkl_path)
    return df
# Source workbook and the processed-cache location used by get_and_pickle_data.
invoices_raw_path = pathlib.Path('../data/raw/falsesouth_receivables.xlsx')
invoices_pkl_path = pathlib.Path('../data/processed/falsesouth_receivables.pkl')
invoice_df = get_and_pickle_data(invoices_raw_path, invoices_pkl_path)
# Normalize both datetime columns down to plain dates: the time-of-day
# component carries no information and clutters later reporting.
for _date_col in ('DUE_DATE', 'INVOICE_DATE'):
    invoice_df[_date_col] = pd.to_datetime(invoice_df[_date_col]).dt.date
General Note: At this point, I generally do a couple of basic checks. I want to make sure my data types are what I expect, I want to understand null/missing values, and I want to get a general feel for the numbers we are dealing with (basic stats). Oftentimes, these checks will raise other questions, and additional steps are taken as needed.
Data Quality Note: 30 null values in the LOAD_NUM column. For this exercise, I don't think this will affect the deliverables, as we appear to have a good INVOICE_ID with no duplicates there. In a real-world scenario, the load number can be quite important and will likely be requested by the broker if an overdue invoice is brought to their attention without it for reference. I would also be very curious whether the missing data were truly in sequential order as they are here...I would want to know if it was a momentary lapse in data collection, or ensure that there is not a systemic problem.
# First check is for a glance at data types, and an understanding of completeness.
# .info() reports each column's dtype and non-null count in a single call.
invoice_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307 entries, 0 to 306 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 COMPANY_NAME 307 non-null object 1 LOAD_NUM 277 non-null object 2 INVOICE_ID 307 non-null object 3 INVOICE_DATE 307 non-null object 4 DUE_DATE 307 non-null object 5 BALANCE 307 non-null float64 6 TOTAL_AMOUNT 307 non-null float64 dtypes: float64(2), object(5) memory usage: 16.9+ KB
# What do records with missing LOAD_NUM look like? Sequential nature would jump out in real scenario.
# Boolean mask on isna() keeps only the rows where LOAD_NUM was never captured.
invoice_df[invoice_df['LOAD_NUM'].isna()]
| COMPANY_NAME | LOAD_NUM | INVOICE_ID | INVOICE_DATE | DUE_DATE | BALANCE | TOTAL_AMOUNT | |
|---|---|---|---|---|---|---|---|
| 70 | Loki Logisitcs Solution | NaN | 71 | 2021-06-14 | 2021-07-04 | 0.0 | 298.275 |
| 71 | Valkyries Global Logistics | NaN | 72 | 2021-06-16 | 2021-07-06 | 25.0 | 25.000 |
| 72 | Avengers and SHIELD Brokerage | NaN | 74 | 2021-06-18 | 2021-07-08 | 0.0 | 50.000 |
| 73 | Valkyries Global Logistics | NaN | 73 | 2021-06-18 | 2021-07-08 | 0.0 | 100.000 |
| 74 | 10 Rings Logistics Transport Inc | NaN | 75 | 2021-06-24 | 2021-07-14 | 0.0 | 440.000 |
| 75 | Avengers and SHIELD Brokerage | NaN | 77 | 2021-06-24 | 2021-07-14 | 0.0 | 330.000 |
| 76 | Starlord Logistics LLC | NaN | 76 | 2021-06-24 | 2021-07-14 | 0.0 | 80.000 |
| 77 | Catwoman BRokerage Inc | NaN | 78 | 2021-06-25 | 2021-07-15 | 0.0 | 210.000 |
| 78 | Kamar Taj Transport, LLC | NaN | 79 | 2021-06-28 | 2021-07-18 | 0.0 | 127.400 |
| 79 | Sanctum Sanctorum International Group, LLC | NaN | 80 | 2021-06-28 | 2021-07-18 | 0.0 | 375.000 |
| 80 | Dr. Strange Transport LLC | NaN | 81 | 2021-06-29 | 2021-07-19 | 150.0 | 150.000 |
| 81 | XMEN Logistics | NaN | 82 | 2021-06-29 | 2021-07-19 | 0.0 | 110.000 |
| 82 | Valkyries Global Logistics | NaN | 83 | 2021-07-06 | 2021-07-26 | 0.0 | 345.000 |
| 83 | Avengers Transport Group LLC | NaN | 84 | 2021-07-08 | 2021-07-28 | 0.0 | 130.000 |
| 84 | Captain Marvel Logistics | NaN | 85 | 2021-07-08 | 2021-07-28 | 0.0 | 70.000 |
| 85 | Avengers Transport Group LLC | NaN | 86 | 2021-07-09 | 2021-07-29 | 0.0 | 300.000 |
| 86 | Captain Marvel Logistics | NaN | 87 | 2021-07-09 | 2021-07-29 | 0.0 | 170.000 |
| 87 | Spiderman Logistics | NaN | 88 | 2021-07-12 | 2021-08-01 | 0.0 | 269.500 |
| 88 | Spiderman Logistics | NaN | 89 | 2021-07-12 | 2021-08-01 | 269.5 | 269.500 |
| 89 | Guardians of the Galaxy Transport Services | NaN | 90 | 2021-07-13 | 2021-08-02 | 610.0 | 610.000 |
| 90 | IRON MAN ENTERPRISE LOGISTICS | NaN | 91 | 2021-07-16 | 2021-08-05 | 140.0 | 140.000 |
| 91 | Sanctum Sanctorum International Group, LLC | NaN | 92 | 2021-07-19 | 2021-08-08 | 37.5 | 37.500 |
| 92 | Catwoman BRokerage Inc | NaN | 93 | 2021-07-20 | 2021-08-09 | 12.9 | 252.900 |
| 93 | Kamar Taj Transport, LLC | NaN | 94 | 2021-07-20 | 2021-08-09 | 0.0 | 122.990 |
| 94 | AVENGERS SUCCESS TRANS | NaN | 95 | 2021-07-22 | 2021-08-11 | 0.0 | 165.000 |
| 95 | CAPTAIN AMERICA TRANS | NaN | 96 | 2021-07-26 | 2021-08-15 | 0.0 | 355.000 |
| 96 | Wasp Logistics | NaN | 97 | 2021-07-27 | 2021-08-16 | 0.0 | 125.000 |
| 97 | Sanctum Sanctorum International Group, LLC | NaN | 99 | 2021-07-28 | 2021-08-17 | 0.0 | 222.500 |
| 98 | Starlord Logistics LLC | NaN | 98 | 2021-07-28 | 2021-08-17 | 0.0 | 180.000 |
| 99 | Avengers and SHIELD Brokerage | NaN | 100 | 2021-07-30 | 2021-08-19 | 0.0 | 100.000 |
Data Quality Note: When checking descriptive stats, I want to get a feel for the spread, but also potential upper and lower outliers.
# Second check is a quick peek at the descriptive statistics of our numeric fields. From this, we can see there are items with $0.00 invoiced? Intentional?
# By default describe() covers only the numeric columns (BALANCE, TOTAL_AMOUNT here).
invoice_df.describe()
| BALANCE | TOTAL_AMOUNT | |
|---|---|---|
| count | 307.000000 | 307.000000 |
| mean | 29.142997 | 195.017599 |
| std | 82.551290 | 144.479487 |
| min | 0.000000 | 0.000000 |
| 25% | 0.000000 | 100.000000 |
| 50% | 0.000000 | 150.000000 |
| 75% | 0.000000 | 260.000000 |
| max | 610.000000 | 1000.000000 |
# Identify invoices carrying a $0.00 total — surfaced by the describe() output above.
zero_amount_mask = invoice_df['TOTAL_AMOUNT'] == 0.0
invoice_df.loc[zero_amount_mask]
| COMPANY_NAME | LOAD_NUM | INVOICE_ID | INVOICE_DATE | DUE_DATE | BALANCE | TOTAL_AMOUNT | |
|---|---|---|---|---|---|---|---|
| 17 | Valkyries Global Logistics | 3000914005LFA | 18 | 2020-11-30 | 2020-12-14 | 0.0 | 0.0 |
| 58 | Guardians of the Galaxy Transport Services | 226656D | 59 | 2021-05-14 | 2021-06-13 | 0.0 | 0.0 |
General Note: Outside of the checks above, the COMPANY_NAME field jumped out at me immediately when reading in our data. Having dealt with human input and naming inconsistencies in just about every system I have ever worked in, seeing the capitalization differences, as well as the names that are VERY similar immediately raise flags for me.
Data Quality Note: After exploring the COMPANY_NAME occurrences, I am fairly confident there are some duplicate accounts (some examples below). In a real-world scenario, this would lead me back to the data source to search for an attribute that might help clarify...my first target would be an Account Number or Address to help differentiate. After that, maybe searching for a "Parent" or "Payer" level account number to see if a business hierarchy exists, as this is often the case within large organizations. It could be that each is actually a separate broker under a parent account, and we want to report to this "Sold-To" level. Either way, it would warrant revisiting the data to understand, and likely an internal business decision to be made by stakeholders.
# A few examples that jumped out, and would be worth exploring for 'roll-up' summary reporting opportunities.
# Count invoices per company for a few suspiciously-similar broker names;
# candidates for a de-duplication / roll-up decision with stakeholders.
_similar_names = invoice_df['COMPANY_NAME'].str.contains('Avengers Transport|Agents of')
_name_counts = invoice_df['COMPANY_NAME'][_similar_names].value_counts()
pd.DataFrame(_name_counts).reset_index().rename(
    columns={'index': 'COMPANY_NAME', 'COMPANY_NAME': 'COUNTS'}).sort_values(by='COMPANY_NAME')
| COMPANY_NAME | COUNTS | |
|---|---|---|
| 6 | Agents of S.H.I.E.L.D. | 1 |
| 1 | Agents of SHIELD | 4 |
| 0 | Avengers Transport Group LLC | 21 |
| 4 | Avengers Transport, INC | 1 |
| 2 | Avengers Transportation Group LLC | 4 |
| 3 | Avengers Transportation Logistics Services Inc | 2 |
| 5 | Avengers Transportation Systems | 1 |
# Function to get total invoiced
def get_total_invoiced(df):
    """Return the grand total of the TOTAL_AMOUNT column — all dollars ever invoiced."""
    return df['TOTAL_AMOUNT'].sum()
# Function to get total outstanding
def get_total_outstanding(df):
    """Return the grand total of the BALANCE column — dollars still owed by brokers."""
    return df['BALANCE'].sum()
# Headline figure: everything ever invoiced across all brokers.
total_invoiced = get_total_invoiced(invoice_df)
print(f'Total Invoiced to all Brokers: ${total_invoiced:,.2f}')
Total Invoiced to all Brokers: $59,870.40
# Collected = everything invoiced minus whatever balance is still open on the books.
total_outstanding = get_total_outstanding(invoice_df)
total_collected = total_invoiced - total_outstanding
print(f'Total Collected: ${total_collected:,.2f}')
Total Collected: $50,923.50
# Open receivables still awaiting broker payment.
print(f'Total Outstanding: ${total_outstanding:,.2f}')
Total Outstanding: $8,946.90
Calculated on 3/28/2022
# Function to get total BALANCE, where DUE_DATE is less than a cutoff date (Max date was 3/20/2022, so all are overdue)
def get_overdue_outstanding(df, as_of=None):
    '''
    Return the total open BALANCE on invoices past due as of a cutoff date.

    Parameters
    ----------
    df : pd.DataFrame
        Invoice data with DUE_DATE (datetime.date) and BALANCE columns.
    as_of : datetime.date, optional
        Cutoff date; invoices with DUE_DATE strictly before it count as
        overdue. Defaults to today, preserving the original behavior while
        making the calculation reproducible for reporting/testing.

    Returns
    -------
    float
        Sum of BALANCE over the overdue rows.
    '''
    if as_of is None:
        # Default keeps the original "relative to run time" behavior.
        as_of = datetime.now().date()
    overdue_df = df[df['DUE_DATE'] < as_of]
    return overdue_df['BALANCE'].sum()
# Every open balance in this dataset is already past due (max DUE_DATE precedes the run date).
total_overdue_outstanding = get_overdue_outstanding(invoice_df)
print(f'Total Overdue Outstanding: ${total_overdue_outstanding:,.2f}')
Total Overdue Outstanding: $8,946.90
# Function to resample/aggregate dataset to monthly totals by INVOICE_DATE month.
def get_monthly_totals(df):
    """Aggregate invoice dollars into monthly buckets keyed on INVOICE_DATE.

    Returns a DataFrame indexed by business-month-start ('BMS') period with
    BALANCE and TOTAL_AMOUNT sums plus a derived COLLECTED column.
    """
    work = df[['INVOICE_DATE', 'BALANCE', 'TOTAL_AMOUNT']].copy()
    work['INVOICE_DATE'] = pd.to_datetime(work['INVOICE_DATE'])
    out = work.set_index('INVOICE_DATE').resample('BMS').sum()
    out['COLLECTED'] = out['TOTAL_AMOUNT'] - out['BALANCE']
    return out
monthly_totals = get_monthly_totals(invoice_df)
# Single subplot; the secondary_y spec is included for flexibility even though both traces share the primary axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add bar plot trace for monthly COLLECTED
fig.add_trace(
    go.Bar(x=monthly_totals.index, y=monthly_totals['COLLECTED'], hovertemplate='<br>Month: %{x}<br>Amount: $%{y:,.2f}', name="Monthly Collected Totals"))
# Add bar plot trace for monthly UNCOLLECTED
fig.add_trace(
    go.Bar(x=monthly_totals.index, y=monthly_totals['BALANCE'], hovertemplate='<br>Month: %{x}<br>Amount: $%{y:,.2f}', name="Monthly Uncollected Totals"))
# Add title, plot size, theme; stacked bars show each month's collected vs. uncollected share of the total.
fig.update_layout(
    title_text="Monthly Invoice Totals - Collected vs. Uncollected",
    height = 550,
    width = 1450,
    template = 'xgridoff',
    barmode='stack')
fig.show(renderer='notebook')
# Median is a better 'typical invoice' than the mean given the right-skew seen in describe() above.
invoice_med = np.median(invoice_df['TOTAL_AMOUNT'])
print(f'Median Invoice Amount: ${invoice_med:,.2f}')
Median Invoice Amount: $150.00
# Box plot to visualize invoice amount statistics. Hover for 5 number summary and outliers, or zoom in/out to explore.
fig = go.Figure()
# boxpoints='all' overlays every individual invoice next to the box; jitter spreads the points for readability.
fig.add_trace(go.Box(x=invoice_df['TOTAL_AMOUNT'], name = 'Invoiced Amount',
    orientation='h', boxpoints='all', jitter=0.3))
fig.update_layout(
    title_text="Distribution of Invoice Amounts (Hover for detailed descriptive statistics)",
    height = 350,
    width = 1350,
    template = 'simple_white')
fig.show(renderer='notebook')
def get_invoice_counts(df):
    """Count invoices per broker.

    Returns a DataFrame with COMPANY_NAME and INVOICE_COUNT columns,
    one row per distinct broker name.
    """
    per_broker = df.groupby('COMPANY_NAME')['INVOICE_ID'].count()
    return per_broker.to_frame().reset_index().rename(columns={'INVOICE_ID': 'INVOICE_COUNT'})
invoice_counts = get_invoice_counts(invoice_df)
# Central-tendency trio for invoices-per-broker; a mode of 1 indicates many one-off brokers.
print(f'Mean Invoice Count: {np.mean(invoice_counts.INVOICE_COUNT):,.2f}')
print(f'Median Invoice Count: {np.median(invoice_counts.INVOICE_COUNT):,.2f}')
print(f'Mode Invoice Count: {stats.mode(invoice_counts.INVOICE_COUNT):,.2f}')
Mean Invoice Count: 7.49 Median Invoice Count: 4.00 Mode Invoice Count: 1.00
General Note: When approaching any output deliverable, I try to put myself in the position of the person that is likely to consume it. The obvious use of a statement would be to have the ability to send it out to the customer. These could also be used internally to process and accelerate collections of accounts. Whether internal or external, it seemed to me that leaving no question in regards to the status of invoice items would be important. With that in mind, my thought was to clearly separate out the "Overdue Invoices" at the top so there is no ambiguity to the reader, and the remainder of "Paid Invoices" below that.
def get_broker_invoices(df, broker_name):
    """Return only the invoice rows belonging to *broker_name* (exact COMPANY_NAME match)."""
    is_broker = df['COMPANY_NAME'] == broker_name
    return df.loc[is_broker]
def write_broker_reports(df):
    '''
    Auto-generate one Excel statement per broker.

    For each unique COMPANY_NAME in ``df``, writes an .xlsx file under
    ../statements/ with the overdue invoices (if any) clearly separated at
    the top and the paid invoices below, so the reader has no ambiguity
    about each item's status.

    Parameters
    ----------
    df : pd.DataFrame
        Invoice data with COMPANY_NAME, DUE_DATE, BALANCE (and the other
        invoice columns) — DUE_DATE is expected to hold datetime.date values.
    '''
    # Snapshot "now" once so every statement (and both headers inside one
    # statement) reports the same as-of date, even if the run crosses midnight.
    as_of = datetime.now()
    as_of_label = as_of.strftime('%d %b %Y')
    # Make sure the output directory exists before the first write.
    statements_dir = pathlib.Path('../statements')
    statements_dir.mkdir(parents=True, exist_ok=True)
    # Iterate through brokers in sorted order, writing each to a separate Excel report.
    for broker in sorted(df.COMPANY_NAME.unique()):
        # Broker line items, most recently due first.
        broker_df = get_broker_invoices(df, broker).sort_values(by='DUE_DATE', ascending=False)
        broker_past_due = broker_df[(broker_df['DUE_DATE'] < as_of.date()) & (broker_df['BALANCE'] > 0)]
        broker_current = broker_df[broker_df['BALANCE'] == 0]
        # Build the output path with pathlib rather than string concatenation.
        statement_path = statements_dir / (broker + ' - Statement.xlsx')
        sheetname = 'Statement'
        start_row = 4
        len_past_due = len(broker_past_due)
        # Context manager replaces writer.save(), which was deprecated and then
        # removed in pandas 2.0; it also guarantees the workbook is closed on error.
        with pd.ExcelWriter(statement_path, engine='xlsxwriter') as writer:
            workbook = writer.book
            if len_past_due > 0:
                # Overdue items first so there is no ambiguity for the reader,
                # then the historical paid items below.
                broker_past_due.to_excel(writer, sheet_name=sheetname, startrow=start_row, index=False)
                worksheet = writer.sheets[sheetname]
                broker_current.to_excel(writer, sheet_name=sheetname, startrow=start_row + len_past_due + start_row, index=False)
                worksheet.write(0, 0, 'FalseSouth - Brokerage Account Statement', workbook.add_format({'bold': True, 'color': '#1d9385', 'size': 28}))
                worksheet.write(1, 0, 'Broker: ' + broker, workbook.add_format({'bold': True, 'color': '#d3755c', 'size': 22}))
                worksheet.write(3, 0, 'Overdue Invoices (As of: ' + as_of_label + ')', workbook.add_format({'bold': True, 'color': '#B72D0F', 'size': 16}))
                worksheet.write(start_row + len_past_due + start_row - 1, 0, 'Paid Invoices (As of: ' + as_of_label + ')', workbook.add_format({'bold': True, 'color': '#108B44', 'size': 16}))
            else:
                # No overdue items: just write all historical paid items.
                broker_df.to_excel(writer, sheet_name=sheetname, startrow=start_row, index=False)
                worksheet = writer.sheets[sheetname]
                worksheet.write(0, 0, broker + ' - Statement', workbook.add_format({'bold': True, 'color': '#1d9385', 'size': 28}))
                worksheet.write(start_row - 1, 0, 'Paid Invoices (As of: ' + as_of_label + ')', workbook.add_format({'bold': True, 'color': '#108B44', 'size': 16}))
            # Widen columns for presentation.
            worksheet.set_column(1, 6, 20)
            worksheet.set_column(0, 0, 30)
# Generate one Excel statement per broker from the full invoice dataset.
write_broker_reports(invoice_df)